## DataViz 2.0 Workshop
## Part 2
library(tidyverse)
## ── Attaching packages ─────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggpubr); library(ggrepel)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Data Import
## Read the gene-location table from the working directory. `header = TRUE`
## makes read.table() take the column names from the first line of the file.
## TRUE is spelled out because `T` is an ordinary variable that can be
## reassigned, whereas TRUE is a reserved word.
gene_loc <- read.table(
  "GSE69360.gene-locations.txt",
  header = TRUE
)
## Plotting
## First look at all genes: End - Start on the x axis against the Length
## column on the y axis, grouped and coloured by chromosome.
scatter <- ggplot(
  data = gene_loc,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point()
scatter

### It is hard to visualize the entire data.
### Let's pretend we are only interested in a small set of chromosomes.
### Let's subset the data and add a few variables!
## Chromosomes we keep for the rest of the workshop.
target <- c("chrX", "chrY", "chrM", "chr17")

## Keep only the genes located on one of the target chromosomes.
gene_loc2 <- gene_loc %>%
  filter(Chr %in% target)

## log10 versions of the two plotted quantities, kept both as stand-alone
## vectors and as new columns of gene_loc2 (log_length first, then
## log_EndStart, so the column order is unchanged).
log_length <- log10(gene_loc2[["Length"]])
log_EndStart <- log10(gene_loc2[["End"]] - gene_loc2[["Start"]])
gene_loc2[["log_length"]] <- log_length
gene_loc2[["log_EndStart"]] <- log_EndStart

head(gene_loc2)
## Geneid Chr Start End Strand Length log_length log_EndStart
## 1 ENSG00000273288 chr17 4961 5048 - 88 1.944483 1.939519
## 2 ENSG00000272636 chr17 5810 6168 - 1480 3.170262 2.553883
## 3 ENSG00000273172 chr17 33615 34249 - 1185 3.073718 2.802089
## 4 ENSG00000181031 chr17 62293 63714 - 5953 3.774736 3.152594
## 5 ENSG00000262920 chr17 171183 171422 + 432 2.635484 2.378398
## 6 ENSG00000262061 chr17 180996 183279 + 2284 3.358696 3.358506
## Now let's check the new scatter plot... mmm still not the best
## Same scatter plot as before, now restricted to the subset in gene_loc2.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point()
scatter

## the gray background is annoying... remove it!
## Rebuild the subset scatter plot with theme_bw() in place of the default
## theme.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point() +
  theme_bw()
scatter

### recap, try a different geometry .. by yourself!
## One box per chromosome showing the distribution of the Length column.
box1 <- ggplot(
  data = gene_loc2,
  mapping = aes(x = Chr, y = Length, group = Chr, color = Chr)
) +
  geom_boxplot() +
  theme_bw()
box1

## adjust the axes
## Restrict both axes with xlim()/ylim(). Points outside these limits are
## dropped from the plot, which is what triggers the "Removed ... rows"
## warning when the plot is drawn.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point() +
  theme_bw() +
  xlim(0, 2500) +
  ylim(0, 10000)
scatter
## Warning: Removed 392 rows containing missing values (geom_point).

### where are the green dots?
## Same zoomed plot, but with smaller, semi-transparent points so that
## overlapping points from different chromosomes remain visible.
scatter3 <- ggplot(
  data = gene_loc2,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point(size = 0.5, alpha = 0.7) +
  theme_bw() +
  xlim(0, 2500) +
  ylim(0, 10000)
scatter3
## Warning: Removed 392 rows containing missing values (geom_point).

## Did it change? compare the plots side-by-side
## Place the two zoomed scatter plots next to each other (one row, two
## columns), tagged "A" and "B".
ggarrange(
  scatter, scatter3,
  nrow = 1,
  ncol = 2,
  labels = c("A", "B")
)
## Warning: Removed 392 rows containing missing values (geom_point).
## Warning: Removed 392 rows containing missing values (geom_point).

## Log-transform both axes... That's better, isn't it?
## Start from the zoomed scatter plot and put both axes on a log10 scale.
## The new scales take the place of the ones added by xlim()/ylim(), which is
## what the messages below report; theme_minimal() replaces theme_bw().
trans_scatter <- scatter +
  scale_x_log10(name = "End-Start") +
  scale_y_log10(name = "Gene length") +
  theme_minimal()
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
trans_scatter

## You want to add the regression lines... So, let's fit a separate linear regression for each chromosome
## Scatter plot of the log10-transformed columns with a straight-line (lm)
## fit per colour group; se = FALSE hides the confidence band.
scatter1 <- ggplot(
  data = gene_loc2,
  mapping = aes(x = log_EndStart, y = log_length, color = Chr)
) +
  geom_point() +
  theme_bw() +
  geom_smooth(method = lm, se = FALSE)
scatter1

## We can't see the lines clearly. Can you think of a solution?
## Same plot with faded points (alpha = 0.2) so the fitted lines stand out.
scatter2 <- ggplot(
  data = gene_loc2,
  mapping = aes(x = log_EndStart, y = log_length, color = Chr)
) +
  geom_point(alpha = 0.2, size = 1) +
  geom_smooth(method = lm, se = FALSE) +
  theme_bw()
scatter2

##Can you put them together in the same graph to compare?
## Show the opaque-point and faded-point versions side by side as panels
## "A" and "B".
ggarrange(
  scatter1, scatter2,
  nrow = 1,
  ncol = 2,
  labels = c("A", "B")
)

## Now, let's add some numerical values to the graph, such as the correlation coefficient (R) and its p-value
## Per-chromosome linear fits, with ggpubr::stat_cor() adding a correlation
## label for each colour group.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = log_EndStart, y = log_length, color = Chr)
) +
  geom_point() +
  theme_bw() +
  geom_smooth(method = lm, se = FALSE) +
  ggpubr::stat_cor()
scatter

## Now, let's add some numerical values to the graph: the linear equation
## Per-chromosome linear fits, with ggpubr::stat_regline_equation() printing
## the fitted line's equation for each colour group.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = log_EndStart, y = log_length, color = Chr)
) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  ggpubr::stat_regline_equation()
scatter

## Your boss wants to see the lines in different plots!
## one linear regression per chromosome, each in its own panel, with the equation and the correlation coefficient
## One panel per chromosome (facet_wrap), each with a black regression line.
## The correlation label and the equation label are placed at fixed heights
## (y = 4.4 and y = 4.2) so that they do not overlap.
ml_scatter <- ggscatter(
  gene_loc2,
  x = "log_EndStart",
  y = "log_length",
  color = "Chr",
  palette = "jco",
  add = "reg.line",
  add.params = list(color = "black")
) +
  facet_wrap(~Chr) +
  stat_cor(label.y = 4.4) +
  stat_regline_equation(label.y = 4.2)
ml_scatter

## labeling a point in a scatterplot..
## Plain scatter plot of the subset (untransformed axes) as the starting
## point for labelling.
scatter <- ggplot(
  data = gene_loc2,
  mapping = aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point()
scatter

## that gene in the corner looks interesting!! What gene is it?
## Write each gene's ID next to its point. The label is mapped inside aes()
## so it is taken from the layer's own data (the Geneid column of gene_loc2)
## rather than passed as a free-standing vector; this keeps labels aligned
## with their rows if the plot is later faceted or the data filtered.
## color = "black" is a fixed setting and overrides the inherited Chr colour
## for the text only.
scatter <- ggplot(
  gene_loc2,
  aes(x = End - Start, y = Length, group = Chr, color = Chr)
) +
  geom_point() +
  geom_text(aes(label = Geneid), size = 2, color = "black")
scatter

## Second example!! Labelling points and adding a confidence interval to a regression.
## Per-chromosome summary of the full table: mean of the Length column and
## the number of rows (genes) for each value of Chr.
a <- summarize(
  group_by(gene_loc, Chr),
  meanLength = mean(Length),
  numGenes = n()
)
head(a)
## # A tibble: 6 x 3
## Chr meanLength numGenes
## <fct> <dbl> <int>
## 1 chr1 2258. 5363
## 2 chr10 2160. 2260
## 3 chr11 2218. 3208
## 4 chr12 2342. 2818
## 5 chr13 1875. 1217
## 6 chr14 1892. 2244
## One point per chromosome: number of genes against mean gene length.
scatter2 <- ggplot(
  data = a,
  mapping = aes(x = numGenes, y = meanLength)
) +
  geom_point() +
  theme_bw()
scatter2

## which chromosome is represented by which point?
## Label each point with its chromosome name. The label is mapped inside
## aes() so it comes from the Chr column of the plotted data instead of an
## external `a$Chr` vector, matching how the geom_text_repel() version of
## this plot maps its labels.
scatter2 <- ggplot(a, aes(x = numGenes, y = meanLength)) +
  geom_point() +
  theme_bw() +
  geom_text(aes(label = Chr), size = 2, color = "black")
scatter2

## geom_text_repel is a better function!
## Same labelled plot using ggrepel: red chromosome labels, with blue
## connector segments.
scatter2 <- ggplot(
  data = a,
  mapping = aes(x = numGenes, y = meanLength)
) +
  geom_point() +
  theme_bw() +
  geom_text_repel(
    mapping = aes(label = Chr),
    color = "red",
    segment.color = "blue"
  )
scatter2

## add confidence interval
## Add a loess smoother on top of the labelled plot. The standard-error band
## is left on (geom_smooth's default) and drawn faintly via alpha = 0.1.
scatter2 <- ggplot(
  data = a,
  mapping = aes(x = numGenes, y = meanLength)
) +
  geom_point() +
  theme_bw() +
  geom_text_repel(
    mapping = aes(label = Chr),
    color = "red",
    segment.color = "blue"
  ) +
  geom_smooth(method = loess, color = "lightblue", alpha = 0.1)
scatter2
